In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Development aid, currently disabled: reload edited modules automatically.
#%load_ext autoreload
#%autoreload 2
# Show matplotlib figures inline in the cell output.
%matplotlib inline
# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

This notebook matches Current Population Survey (CPS) data from IPUMS (https://cps.ipums.org/cps/) with social class information from Wikipedia (https://en.wikipedia.org/wiki/Household_income_in_the_United_States).


In [4]:
# Show the kernel's current working directory (IPython automagic).
# NOTE(review): the execution counts show this cell (In[4]) ran after the
# `cd` cell below (In[3]); on a fresh Restart & Run All it would run first
# and report the launch directory instead -- confirm the intended order.
pwd


Out[4]:
'C:\\Users\\gary\\Documents\\data'

In [3]:
cd C:\Users\gary\Documents\data


C:\Users\gary\Documents\data

In [5]:
# List the working directory to see which data files are available
# (IPython automagic; prints the platform's directory listing).
ls


 Volume in drive C is OS
 Volume Serial Number is F8CA-7CC7

 Directory of C:\Users\gary\Documents\data

07/02/2017  04:16 PM    <DIR>          .
07/02/2017  04:16 PM    <DIR>          ..
07/02/2017  08:15 AM    <DIR>          atusact_2016
07/01/2017  09:25 AM         3,684,229 atusact_2016.zip
07/02/2017  08:15 AM    <DIR>          atuscps_2016
07/01/2017  09:26 AM         6,860,353 atuscps_2016.zip
07/02/2017  09:28 AM           864,474 atuscpscodebk16.pdf
07/02/2017  09:29 AM           570,216 atusintcodebk16.pdf
07/01/2017  09:35 AM    <DIR>          atusresp_2016
07/01/2017  09:25 AM           841,679 atusresp_2016.zip
07/02/2017  08:15 AM    <DIR>          atusrost_2016
07/01/2017  09:25 AM           144,334 atusrost_2016.zip
07/02/2017  08:17 AM    <DIR>          atusrostec_2016
07/01/2017  09:26 AM            23,198 atusrostec_2016.zip
07/02/2017  08:17 AM    <DIR>          atussum_2016
07/01/2017  09:25 AM           713,076 atussum_2016.zip
07/02/2017  08:17 AM    <DIR>          atuswho_2016
07/01/2017  09:25 AM           841,409 atuswho_2016.zip
07/02/2017  04:16 PM    <DIR>          cps_00001.csv
07/02/2017  04:15 PM         1,215,296 cps_00001.csv.gz
07/02/2017  09:26 AM            70,656 hinc01_1.xls
              11 File(s)     15,828,920 bytes
              10 Dir(s)  669,635,338,240 bytes free

In [6]:
# Read the CPS extract into a DataFrame. The path is relative to the working
# directory; the CSV sits inside a folder that carries the same name.
filename = 'cps_00001.csv/cps_00001.csv'
resp = pd.read_csv(filepath_or_buffer=filename)
# Summary statistics for every numeric column.
resp.describe()


Out[6]:
YEAR SERIAL HWTSUPP ASECFLAG HHINCOME WTSUPP
count 185487.0 185487.000000 185487.000000 185487.0 1.854870e+05 185487.000000
mean 2016.0 48751.964774 1679.426439 1.0 9.119382e+04 1719.088077
std 0.0 27496.014576 1022.340851 0.0 9.988669e+04 1065.911059
min 2016.0 2.000000 101.190000 1.0 -9.999000e+03 69.880000
25% 2016.0 24557.000000 834.760000 1.0 3.510700e+04 846.260000
50% 2016.0 49263.000000 1640.200000 1.0 6.900000e+04 1651.990000
75% 2016.0 73243.000000 2133.940000 1.0 1.165530e+05 2218.350000
max 2016.0 94097.000000 8148.970000 1.0 2.289913e+06 12515.280000

In [7]:
# Print column names, non-null counts, dtypes and memory usage of the extract.
resp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185487 entries, 0 to 185486
Data columns (total 6 columns):
YEAR        185487 non-null int64
SERIAL      185487 non-null int64
HWTSUPP     185487 non-null float64
ASECFLAG    185487 non-null int64
HHINCOME    185487 non-null int64
WTSUPP      185487 non-null float64
dtypes: float64(2), int64(4)
memory usage: 8.5 MB

In [8]:
# Column labels of the extract as a plain Python list.
resp.columns.tolist()


Out[8]:
['YEAR', 'SERIAL', 'HWTSUPP', 'ASECFLAG', 'HHINCOME', 'WTSUPP']

In [9]:
# Peek at the first five records.
resp.head(n=5)


Out[9]:
YEAR SERIAL HWTSUPP ASECFLAG HHINCOME WTSUPP
0 2016 2 1132.05 1 13309 1132.05
1 2016 7 1077.54 1 29296 1077.54
2 2016 7 1077.54 1 29296 1077.54
3 2016 8 1151.49 1 24099 1151.49
4 2016 9 705.00 1 6300 705.00

In [33]:
# Fraction of records that make up the top slice (0.01 is the top 1 %).
percent = 0.01
# Number of records in that slice; kept as a float and truncated where used.
n = len(resp) * percent
int(n)


Out[33]:
1854

In [35]:
# Select the int(n) records with the highest HHINCOME, then show the income
# range that slice spans as (lowest, highest).
# NOTE(review): head() shows the same SERIAL on more than one row, so this
# looks like a count of person records rather than households, and no survey
# weight is applied -- confirm that is the intended definition of "top 1 %".
top01 = resp.nlargest(n=int(n), columns='HHINCOME')
(top01['HHINCOME'].min(), top01['HHINCOME'].max())


Out[35]:
(441050, 2289913)

In [11]:
# HHINCOME value at the 12th percentile of the records.
# NOTE(review): computed over all rows without weights; head() shows repeated
# SERIAL values, so rows appear to be persons rather than households --
# confirm this is the intended basis for the income thresholds.
np.percentile(resp['HHINCOME'], q=12)


Out[11]:
20000.0

In [15]:
# HHINCOME value at the 24th percentile of the records (unweighted).
np.percentile(resp['HHINCOME'], q=24)


Out[15]:
34315.0

In [16]:
# HHINCOME value at the 54th percentile of the records (unweighted).
np.percentile(resp['HHINCOME'], q=54)


Out[16]:
75010.0

In [17]:
# HHINCOME value at the 84th percentile of the records (unweighted).
np.percentile(resp['HHINCOME'], q=84)


Out[17]:
146892.67999999993

In [18]:
# HHINCOME value at the 99th percentile of the records (unweighted).
np.percentile(resp['HHINCOME'], q=99)


Out[18]:
440857.35999999661

In [ ]: